using System;
using System.Runtime.CompilerServices;
using size_t = nuint;

unsafe struct MatrixMulVanilla : IMatrixMulSimdSpecific
{
	public size_t BlockRows => 228;
	public size_t BlockCols => 128;
	public size_t KernelRows => 2;
	public size_t KernelCols => 2;

	public void PackBlock(double* dst, double* src, size_t src_rows, size_t src_cols, size_t src_stride)
	{
		PackBlockBy2Cols(dst, src, src_rows, src_cols, src_stride);
	}

	public void PackVPanel(double* dst, double* src, size_t src_rows, size_t src_cols, size_t src_stride)
	{
		PackVPanelBy2Rows(dst, src, src_rows, src_cols, src_stride);
	}

	public void MulKernelComplete(bool add, double* C, double* A, double* B, size_t A_cols, size_t C_stride)
	{
		MulKernel2x2(add, C, A, B, A_cols, C_stride);
	}

	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	public void MulKernelIncomplete(bool add, double* C, double* A, double* B, size_t C_rows, size_t C_cols, size_t A_cols, size_t C_stride)
	{
		if (C_rows == 2)
			MulKernel2x1(add, C, A, B, A_cols, C_stride);
		else if (C_cols == 2)
			MulKernel2x1(add, C, B, A, A_cols, 1);  // trick: swap A, B
		else
			MulKernel1x1(add, C, A, B, A_cols);
	}

	[MethodImpl(MethodImplOptions.AggressiveOptimization)]
	static void PackVPanelBy2Rows(double* dst, double* src, size_t src_rows, size_t src_cols, size_t src_stride)
	{
		size_t i = 0;
		for (; i + 2 <= src_rows; i += 2)
		{
			double* src0 = src + (i + 0) * src_stride;
			double* src1 = src + (i + 1) * src_stride;

			for (size_t j = 0; j < src_cols; ++j)
			{
				*(dst + 0) = *(src0 + j);
				*(dst + 1) = *(src1 + j);
				dst += 2;
			}
		}
		if (i < src_rows)
		{
			double* src0 = src + i * src_stride;
			for (size_t j = 0; j < src_cols; ++j)
			{
				*dst = *(src0 + j);
				dst += 1;
			}
		}
	}

	[MethodImpl(MethodImplOptions.AggressiveOptimization)]
	static void PackBlockBy2Cols(double* dst, double* src, size_t src_rows, size_t src_cols, size_t src_stride)
	{
		size_t packed_rows = (src_cols + 1) / 2;
		double* dst_last_row = dst + (packed_rows - 1) * src_rows * 2;
		size_t last_row_step = src_cols % 2;

		for (size_t i = 0; i < src_rows; ++i)
		{
			double* src_row = src + i * src_stride;
			double* dst_col = dst + i * 2;

			size_t j = 0;
			for (; j + 2 <= src_cols; j += 2)
			{
				*(dst_col + 0) = *(src_row + j + 0);
				*(dst_col + 1) = *(src_row + j + 1);
				dst_col += src_rows * 2;
			}

			if (j < src_cols)
			{
				*dst_last_row = src_row[j];
				dst_last_row += last_row_step;
			}
		}
	}

	[MethodImpl(MethodImplOptions.AggressiveOptimization)]
	static void MulKernel2x2(bool add, double* C, double* A, double* B, size_t A_cols, size_t C_stride)
	{
		double sum00, sum10, sum01, sum11;
		sum00 = sum10 = sum01 = sum11 = 0;

		double* a0 = A, b0 = B;
		size_t A_cols_2 = A_cols & ~(size_t)1;
		size_t k = 0;
		for (; k < A_cols_2; k += 2)
		{
			double va, vb0, vb1;
			vb0 = b0[0];
			vb1 = b0[1];
			va = a0[0];
			sum00 += va * vb0;
			sum01 += va * vb1;
			va = a0[1];
			sum10 += va * vb0;
			sum11 += va * vb1;
			vb0 = b0[2];
			vb1 = b0[3];
			va = a0[2];
			sum00 += va * vb0;
			sum01 += va * vb1;
			va = a0[3];
			sum10 += va * vb0;
			sum11 += va * vb1;
			a0 += 4;
			b0 += 4;
		}
		if (k < A_cols)
		{
			double va, vb0, vb1;
			vb0 = b0[0];
			vb1 = b0[1];
			va = a0[0];
			sum00 += va * vb0;
			sum01 += va * vb1;
			va = a0[1];
			sum10 += va * vb0;
			sum11 += va * vb1;
		}

		double* c0, c1;
		c0 = C + C_stride * 0;
		c1 = C + C_stride * 1;

		if (add)
		{
			sum00 += c0[0]; sum01 += c0[1];
			sum10 += c1[0]; sum11 += c1[1];
		}
		c0[0] = sum00; c0[1] = sum01;
		c1[0] = sum10; c1[1] = sum11;
	}

	[MethodImpl(MethodImplOptions.AggressiveOptimization)]
	static void MulKernel2x1(bool add, double* C, double* A, double* B, size_t A_cols, size_t C_stride)
	{
		double sum00x, sum10x, sum00y, sum10y;

		sum00x = sum10x = sum00y = sum10y = 0;

		double* a0 = A, b0 = B;
		size_t A_cols_2 = A_cols & ~(size_t)1;
		size_t k = 0;
		for (; k < A_cols_2; k += 2)
		{
			double vb0;
			vb0 = b0[0];
			sum00x += a0[0] * vb0;
			sum10x += a0[1] * vb0;
			vb0 = b0[1];
			sum00y += a0[2] * vb0;
			sum10y += a0[3] * vb0;
			a0 += 4;
			b0 += 2;
		}
		if (k < A_cols)
		{
			double vb0;
			vb0 = b0[0];
			sum00x += a0[0] * vb0;
			sum10x += a0[1] * vb0;
		}

		double sum00 = sum00x + sum00y;
		double sum10 = sum10x + sum10y;

		double* c0, c1;
		c0 = C + C_stride * 0;
		c1 = C + C_stride * 1;

		if (add)
		{
			sum00 += c0[0];
			sum10 += c1[0];
		}
		c0[0] = sum00;
		c1[0] = sum10;
	}

	[MethodImpl(MethodImplOptions.AggressiveOptimization)]
	static void MulKernel1x1(bool add, double* C, double* A, double* B, size_t A_cols)
	{
		double sum00x, sum00y, sum00z, sum00w;
		sum00x = sum00y = sum00z = sum00w = 0;

		double* a0 = A, b0 = B;
		size_t A_cols_4 = A_cols & ~(size_t)3;
		size_t k = 0;
		for (; k < A_cols_4; k += 4)
		{
			sum00x += a0[0] * b0[0];
			sum00y += a0[1] * b0[1];
			sum00z += a0[2] * b0[2];
			sum00w += a0[3] * b0[3];
			a0 += 4;
			b0 += 4;
		}
		for (; k < A_cols; ++k)
		{
			sum00x += a0[0] * b0[0];
			a0 += 1;
			b0 += 1;
		}

		double sum00 = (sum00x + sum00y) + (sum00z + sum00w);

		if (add)
		{
			sum00 += *C;
		}
		*C = sum00;
	}

	public void Dispose() { }
}
